package is2.lemmatizer;

import is2.data.Cluster;
import is2.data.D4;
import is2.data.Instances;
import is2.data.InstancesTagger;
import is2.data.PipeGen;
import is2.data.SentenceData09;
import is2.io.CONLLReader09;
import is2.tools.IPipe;
import is2.util.DB;
import is2.util.OptionsSuper;
import is2.data.Long2Int;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map.Entry;




/**
 * Feature pipe of the lemmatizer.
 *
 * <p>The pipe registers the feature alphabet (words, characters, part-of-speech tags,
 * relation labels and edit scripts) found in a CoNLL-09 file, decides which form/lemma
 * pairs are handled by an edit script and which by a direct word map ({@link #opse}),
 * and fills the feature vector for a single token in
 * {@link #addCoreFeatures(InstancesTagger, int, int, int, String, long[])}.
 *
 * <p>Note that the feature ids ({@code _f*}) and alphabet sizes ({@code s_*}) are kept in
 * static fields that are written by instance methods ({@link #initFeatures()},
 * {@link #initValues()}); all instances therefore share them.
 */
final public class Pipe extends PipeGen implements IPipe {

	/** A script is only used when it maps more than this many distinct lower-cased forms. */
	private static final int _MIN_WORDS_MAPPED_BY_SCRIPT = 1;
	/** A script is only used when it was observed more often than this. */
	private static final int _MIN_OCCURENT_FOR_SCRIPT_USE = 4;

	// Names of the feature templates as registered under TYPE.
	private static final String _F0 = "F0";
	private static final String _F1 = "F1",_F2 = "F2",_F3 = "F3",_F4 = "F4",_F5 = "F5",_F6= "F6",_F7= "F7",_F8= "F8",_F9="F9",_F10 = "F10";
	private static final String _F11="F11",_F12="F12",_F13= "F13",_F14="F14",_F15="F15",_F16="F16",_F17="F17",_F18="F18",_F19="F19",_F20="F20";
	private static final String _F21="F21",_F22="F22",_F23= "F23",_F24="F24",_F25="F25",_F26="F26",_F27="F27",_F28="F28",_F29="F29",_F30="F30";
	private static final String _F31="F31",_F32="F32",_F33= "F33",_F34="F34",_F35="F35",_F36="F36",_F37="F37",_F38="F38",_F39="F39",_F40="F40";
	private static final String _F41="F41";

	// Ids returned by mf.register(TYPE, ...) for the templates above; set in initFeatures().
	private static int _f0,_f1,_f2,_f3,_f4,_f5,_f6,_f7,_f8,_f9,_f10,_f11,_f12,_f13,_f14,_f15,_f16,_f17,_f18,_f19,_f20;
	private static int _f21,_f22,_f23,_f24,_f25,_f26,_f27,_f28,_f29,_f30,_f31,_f32,_f33,_f34,_f35,_f36,_f37,_f38,_f39,_f41;

	/** Ids of the CHAR end marker and the WORD start/end markers; set in initFeatures(). */
	public static int  _CEND,_swrd,_ewrd;

	/** Alphabet sizes read from the feature counter in initValues(). */
	public static int s_pos,s_word,s_type,s_dir,s_dist, s_char, s_oper;

	public static final String MID = "MID", END = "END",STR = "STR",OPERATION = "OP";

	private CONLLReader09 depReader;

	/** Lower-cased form -> lemma for words that are mapped directly instead of by a script. */
	public HashMap<String,String> opse = new HashMap<String, String> ();

	/** Edit scripts indexed by their id in the OPERATION alphabet. */
	public String[] types;

	public MFO mf =new MFO();

	// Feature builders: z is configured with character-sized slots, x with word-sized slots
	// (see initValues()).
	private D4 z, x;

	Cluster cl;
	OptionsSuper options;
	Long2Int li;

	/**
	 * @param options2 the options; {@code clusterFile} and {@code count} are read by this class
	 * @param l the mapping handed to the {@link D4} feature builders
	 */
	public Pipe (OptionsSuper options2, Long2Int l) {
		options=options2;
		li=l;
	}


	/**
	 * Reads the given file twice: the first pass registers all feature parts and collects
	 * the edit scripts, the second pass fills the returned instances.
	 *
	 * @param file the CoNLL-09 file to read (both passes read this file)
	 * @return the instances created from the file
	 */
	public InstancesTagger createInstances(String file)  {

		InstancesTagger is = new InstancesTagger();

		depReader = new CONLLReader09(CONLLReader09.NO_NORMALIZE);

		depReader.startReading(file);
		mf.register(REL,"<root-type>");
		mf.register(POS,"<root-POS>");

		System.out.print("Registering feature parts ");

		// edit script -> number of tokens (positions >= 1) it was derived from
		HashMap<String,Integer> ops = new HashMap<String, Integer> ();
		// edit script -> distinct lower-cased forms it was derived from
		HashMap<String,HashSet<String>> op2form = new HashMap<String, HashSet<String>> ();
		// lower-cased forms seen with a frequent script; removed from the word map below
		HashSet<String> rm = new HashSet<String> ();
		int ic=0;
		int del=0;

		while(true) {
			SentenceData09 instance1 = depReader.getNext();
			if (instance1== null) break;
			ic++;
			if (ic % 100 ==0) {del = outValue(ic, del);}

			// The order of the register calls below is kept exactly as it was,
			// since it determines the order in which entries reach the alphabet.
			String[] labs1 = instance1.labels;
			for(int i1 = 0; i1 < labs1.length; i1++) {
				mf.register(REL, labs1[i1]);
			}

			String[] w = instance1.forms;
			for(int i1 = 0; i1 < w.length; i1++) {
				String lower = w[i1].toLowerCase();
				// first time this lower-cased form is seen: remember its lemma
				if (mf.getValue(WORD, lower)==-1) {
					opse.put(lower, instance1.lemmas[i1]);
				}
				mf.register(WORD, lower);
			}
			for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD,  w[i1]);

			w = instance1.lemmas;
			for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD,  w[i1]);
			for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD,  w[i1].toLowerCase());

			w = instance1.plemmas;
			for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD,  w[i1]);
			for(int i1 = 0; i1 < w.length; i1++) mf.register(WORD,  w[i1].toLowerCase());

			// characters are registered from the predicted lemmas (w still refers to plemmas)
			for(int i1 = 0; i1 < w.length; i1++) registerChars(CHAR,  w[i1]);

			w = instance1.ppos;
			for(int i1 = 0; i1 < w.length; i1++) mf.register(POS,  w[i1]);

			w = instance1.gpos;
			for(int i1 = 0; i1 < w.length; i1++) mf.register(POS,  w[i1]);

			// Position 0 is skipped (presumably the artificial root token - TODO confirm).
			for(int i1 = 1; i1 < w.length; i1++)  {
				String op = getOperation(instance1, i1);
				String lower = instance1.forms[i1].toLowerCase();

				Integer seen = ops.get(op);
				int count = seen==null ? 1 : seen+1;
				ops.put(op, count);
				if (count>_MIN_OCCURENT_FOR_SCRIPT_USE) rm.add(lower);

				HashSet<String> forms = op2form.get(op);
				if (forms==null) {
					forms = new HashSet<String>();
					op2form.put(op, forms);
				}
				forms.add(lower);
			}
		}

		int countFreqSingleMappings =0;
		int sc=0;
		for(Entry<String, Integer> e : ops.entrySet()) {
			HashSet<String> forms = op2form.get(e.getKey());

			// do not use scripts for infrequent cases or frequent single mappings (der -> die)
			if(e.getValue()>_MIN_OCCURENT_FOR_SCRIPT_USE  &&  forms.size()>_MIN_WORDS_MAPPED_BY_SCRIPT) {
				mf.register(OPERATION, e.getKey());
				sc++;
			} else {
				// the forms of a rejected script stay in the word map
				rm.removeAll(forms);

				if (forms.size()<=1) countFreqSingleMappings+=forms.size();
			}
		}

		// forms covered by a used script do not need a direct form -> lemma entry
		for(String k : rm) {
			opse.remove(k);
		}

		if (options.clusterFile==null)cl = new Cluster();
		else cl=  new Cluster(options.clusterFile, mf,6);

		System.out.println("\nfound scripts "+ops.size()+" used scripts "+sc);
		System.out.println("found mappings of single words "+countFreqSingleMappings);
		System.out.println("use word maps instead of scripts "+this.opse.size());
		System.out.println(""+mf.toString());

		initFeatures();

		mf.calculateBits();
		initValues();

		// Second pass over the same file that was used to register the features and to
		// count the sentences (ic) the instances are initialised with.
		depReader.startReading(file);

		int i = 0;
		long start1 = System.currentTimeMillis();

		System.out.print("Creating Features: ");
		is.init(ic, mf) ;
		del=0;
		while(true) {
			try {
				if (i % 100 ==0) {del = outValue(i, del);}
				SentenceData09 instance1 = depReader.getNext(is);
				if (instance1== null) break;

				is.fillChars(instance1, i, _CEND);

				if (i>options.count) break;

				i++;
			} catch(Exception e) {
				// best effort: report the sentence and continue with the next read
				DB.println("error in sentnence "+i);
				e.printStackTrace();
			}
		}
		long end1 = System.currentTimeMillis();
		System.gc();
		long mem2 = Runtime.getRuntime().totalMemory() -  Runtime.getRuntime().freeMemory();
		System.out.print("  time "+(end1-start1)+" mem "+(mem2/1024)+" kb");

		types = operationTypes();

		System.out.println("Num Features: " + mf.size());

		return is;
	}


	/**
	 * Builds the id -> edit script table from the OPERATION alphabet of {@link #mf}.
	 *
	 * @return an array that holds each registered script at the index of its id
	 */
	private String[] operationTypes() {
		String[] names = new String[mf.getFeatureCounter().get(OPERATION)];
		for(Entry<String,Integer> e : mf.getFeatureSet().get(OPERATION).entrySet()) {
			names[e.getValue()] = e.getKey();
		}
		return names;
	}


	/**
	 * Computes the edit script for token {@code k} of sentence {@code n}.
	 *
	 * @param is the instances providing the form and gold lemma ids
	 * @param n the sentence index
	 * @param k the token index
	 * @param wds id -> string table used to resolve the form and lemma ids
	 * @return the edit script, or {@code "0"} if form and lemma are equal ignoring case
	 */
	public static String getOperation(Instances is, int n, int k, String[] wds) {
		String form = wds[is.forms[n][k]];
		String olemma = wds[is.glemmas[n][k]];
		return getOperation(form, olemma);
	}


	/**
	 * Computes the edit script that maps the form of token {@code i1} to its lemma.
	 *
	 * @param instance1 the sentence
	 * @param i1 the token index
	 * @return the edit script, or {@code "0"} if form and lemma are equal ignoring case
	 */
	public static String getOperation(SentenceData09 instance1, int i1) {
		return getOperation(instance1.forms[i1], instance1.lemmas[i1]);
	}

	/**
	 * Computes the edit script between two strings. Both strings are lower-cased and
	 * reversed before they are compared.
	 *
	 * @param si the source string (form)
	 * @param ti the target string (lemma)
	 * @return the edit script, or {@code "0"} if the strings are equal ignoring case
	 */
	public static String getOperation(String si, String ti) {
		String s = reversedLowerCase(si);
		String t = reversedLowerCase(ti);
		return getOperation2(s, t);
	}

	/** Returns the lower-cased input with its characters in reverse order. */
	private static String reversedLowerCase(String word) {
		return new StringBuilder(word.toLowerCase()).reverse().toString();
	}


	/**
	 * Derives the script for two already lower-cased and reversed strings.
	 *
	 * @return the path written by {@code StringEdit.searchPath}, or {@code "0"} (do nothing)
	 *         when the strings are equal
	 */
	private static String getOperation2(String s, String t) {
		if (s.equals(t)) return "0"; // do nothing

		StringBuffer po = new StringBuffer();
		int[][] d =StringEdit.LD(s, t);
		StringEdit.searchPath(s,t,d, po, false);
		return po.toString();
	}


	/** Registers every character of {@code word} under the feature type {@code type}. */
	private void registerChars(String type, String word) {
		for(int i=0;i<word.length();i++) {
			mf.register(type, Character.toString(word.charAt(i)));
		}
	}


	/**
	 * Reads the alphabet sizes from {@link #mf}, rebuilds {@link #types} and configures the
	 * slot sizes of the two feature builders. Must be called after the features are registered.
	 */
	public void initValues() {

		z = new D4(li);
		x = new D4(li);

		s_pos = mf.getFeatureCounter().get(POS).intValue();
		s_word = mf.getFeatureCounter().get(WORD);
		s_type = mf.getFeatureCounter().get(TYPE).intValue();
		s_char = mf.getFeatureCounter().get(CHAR).intValue();
		s_oper = mf.getFeatureCounter().get(OPERATION).intValue();

		types = operationTypes();

		// z: template, operation, then character slots
		z.a0 = s_type;z.a1 = s_oper; z.a2 = s_char; z.a3 = s_char; z.a4 = s_char;z.a5 = s_char;z.a6 = s_char;z.a7 = s_char;
		// x: template, operation, three word slots, then character slots
		x.a0 = s_type; x.a1 = s_oper;x.a2 = s_word; x.a3 = s_word; x.a4 = s_word;x.a5 = s_char;x.a6 = s_char;x.a7 = s_char;

	}


	/**
	 * Registers the feature template names and the start/middle/end markers and stores
	 * the returned ids in the static id fields.
	 */
	public void initFeatures() {

		// register F0..F49 first; the calls below return the id of each template name
		for(int k=0;k<50;k++) {
			mf.register(TYPE, "F"+k);
		}

		_f0 = mf.register(TYPE, _F0);
		_f1 = mf.register(TYPE, _F1);
		_f2 = mf.register(TYPE, _F2);
		_f3 = mf.register(TYPE, _F3);
		_f4 = mf.register(TYPE, _F4);
		_f5 = mf.register(TYPE, _F5);
		_f6 = mf.register(TYPE, _F6);
		_f7 = mf.register(TYPE, _F7);
		_f8 = mf.register(TYPE, _F8);
		_f9 = mf.register(TYPE, _F9);
		_f10 = mf.register(TYPE, _F10);
		_f11 = mf.register(TYPE, _F11);
		_f12 = mf.register(TYPE, _F12);
		_f13 = mf.register(TYPE, _F13);
		_f14 = mf.register(TYPE, _F14);
		_f15 = mf.register(TYPE, _F15);
		_f16 = mf.register(TYPE, _F16);
		_f17 = mf.register(TYPE, _F17);
		_f18 = mf.register(TYPE, _F18);
		_f19 = mf.register(TYPE, _F19);
		_f20 = mf.register(TYPE, _F20);
		_f21 = mf.register(TYPE, _F21);
		_f22 = mf.register(TYPE, _F22);
		_f23 = mf.register(TYPE, _F23);
		_f24 = mf.register(TYPE, _F24);
		_f25 = mf.register(TYPE, _F25);
		_f26 = mf.register(TYPE, _F26);
		_f27 = mf.register(TYPE, _F27);
		_f28 = mf.register(TYPE, _F28);
		_f29 = mf.register(TYPE, _F29);
		_f30 = mf.register(TYPE, _F30);

		_f31 = mf.register(TYPE, _F31);
		_f32 = mf.register(TYPE, _F32);
		_f33 = mf.register(TYPE, _F33);
		_f34 = mf.register(TYPE, _F34);

		_f35 = mf.register(TYPE, _F35);
		_f36 = mf.register(TYPE, _F36);
		_f37 = mf.register(TYPE, _F37);
		_f38 = mf.register(TYPE, _F38);

		mf.register(POS, MID);
		mf.register(POS, STR);
		mf.register(POS, END);
		mf.register(TYPE, CHAR);

		_swrd = mf.register(WORD, STR);
		_ewrd = mf.register(WORD, END);

		_CEND = mf.register(CHAR, END);
	}


	/**
	 * Fills {@code f} with the feature values for token {@code i} of sentence {@code ic}
	 * combined with the operation {@code oper}. The array is zeroed first; slots that are
	 * not written stay 0.
	 *
	 * @param is the instances holding the form ids and the per-token character table
	 * @param ic the sentence index
	 * @param i the token index
	 * @param oper the id of the operation (edit script)
	 * @param form the surface form of the token; scanned for upper-case letters and digits
	 * @param f the output array; the code writes fixed slots up to index 27 and running
	 *          slots after index 9, so it has to be large enough for both
	 */
	final public void addCoreFeatures(InstancesTagger is, int ic, int i, int oper, String form, long[] f) {

		for(int l=f.length-1;l>=0;l--) f[l]=0;

		int formi =is.forms[ic][i];
		// entry 11 of the character table is used as the number of characters of the form
		int wl =is.chars[ic][i][11];

		int position = 1+(i<3?i:3);

		// entries 0..4 and 6..10 of the character table
		// (presumably the leading and trailing characters of the form - TODO confirm)
		int c0= is.chars[ic][i][0], c1=is.chars[ic][i][1], c2=is.chars[ic][i][2], c3=is.chars[ic][i][3], c4=is.chars[ic][i][4];
		int e0 =is.chars[ic][i][6], e1 =is.chars[ic][i][7],e2 =is.chars[ic][i][8],e3 =is.chars[ic][i][9],e4 =is.chars[ic][i][10];

		int len = is.length(ic);

		// current form, also combined with the (capped) position in the sentence
		x.v1=oper; x.v0 = _f0; x.v2 = formi; x.cz3(); f[0]=x.getVal(); f[1]=x.csa(3, position);
		// current form + next form (end marker for the last token)
		x.v0 = _f1; x.v2 = formi; x.v3 = i+1>=len ? _ewrd : is.forms[ic][i+1]; x.cz4(); f[2]=x.getVal();

		// upper: 0 = no upper case, 1 = first char only, 2 = only a later char, 3 = first and a later char
		// number: 1 = no digit, 2 = last seen digit state "first char is a digit", 3 = a later char is a digit
		short upper =0;
		short number = 1;
		for(int k1=0;k1<wl;k1++){
			char c =form.charAt(k1);
			if (Character.isUpperCase(c)) {
				if (k1==0) upper=1;
				else {
					// first char + another
					if (upper==1)upper=3;
					// another uppercase in the word
					else if (upper==0) upper=2;
				}
			}

			if (Character.isDigit(c) && k1==0) number =2 ;
			else if (Character.isDigit(c) && number==1) number = 3 ;

		}

		// contains a number
		// NOTE(review): z.v1 is not assigned in this method before this call; if cz3() reads
		// v1 it sees the value left by the previous invocation - TODO confirm.
		z.v0= _f21;	 z.v2=number;  z.cz3();f[3]=z.getVal();

		z.v0 = _f4; z.v1 = oper; z.v2=c0; z.cz3();f[4]=z.getVal();
		z.v0 = _f5; z.v2 = e0;z.cz3();f[5]=z.getVal();

		// growing windows over entries 0..4 of the character table
		z.v2=c0; z.v3=c1; z.v4=c2; z.v5=c3; z.v6=c4;
		z.v0=_f6; z.cz4(); f[6]=z.getVal();
		z.v0=_f7; z.cz5(); f[7]=z.getVal();
		z.v0=_f8; z.cz6(); f[8]=z.getVal();
		z.v0=_f9; z.cz7(); f[9]=z.getVal();

		// from here on the slots are assigned consecutively
		int c=10;
		// growing windows over entries 6..10, each also combined with the upper-case class
		z.v2=e0; z.v3=e1; z.v4=e2; z.v5=e3; z.v6=e4;
		z.v0 =_f10; z.cz4();f[c++]=z.getVal(); f[c++]= z.csa(3, upper);
		z.v0 =_f11; z.cz5();f[c++]=z.getVal(); f[c++]= z.csa(3, upper);
		z.v0 =_f12; z.cz6();f[c++]=z.getVal(); f[c++]= z.csa(3, upper);
		z.v0 =_f13; z.cz7();f[c++]=z.getVal(); f[c++]= z.csa(3, upper);

		// features of the following tokens
		if (len>i+1) {

			z.v0 = _f14;  z.v2 = is.chars[ic][i+1][0];
			z.cz3();f[c++]=z.getVal();

			z.v0 = _f15;  z.v2 = is.chars[ic][i+1][5];z.cz3();f[c++]=z.getVal();

			if (is.chars[ic][i+1][11]>1 ) {
				z.v0 = _f16;  z.v2 = is.chars[ic][i+1][0];
				z.v3 = is.chars[ic][i+1][2];z.cz4();f[c++]=z.getVal();

				z.v0 = _f17;  z.v2 = is.chars[ic][i+1][1];
				z.v3 = is.chars[ic][i+1][6];
				z.cz4();f[c++]=z.getVal();
			}

			x.v0 = _f18;
			x.v2 =  is.forms[ic][i+1];
			x.cz3();f[c++]=x.getVal();

			if (len>i+2) {
				x.v0 = _f32;
				x.v2 =  is.forms[ic][i+2]; 	x.v3 =  is.forms[ic][i+1]; x.cz4();f[c++]=x.getVal();
				x.cz3();f[c++]=x.getVal();
			}

			if (len>i+3) {
				x.v0 = _f33;  x.v2 =  is.forms[ic][i+3]; x.v3 =  is.forms[ic][i+2];x.cz4();f[c++]=x.getVal();
				// NOTE(review): this value goes to the fixed slot 27 instead of f[c++]. A later
				// f[c++] write can reach index 27 and replace it (e.g. when all branches above
				// are taken and i >= 1). Left unchanged because changing it alters the emitted
				// feature set and the number of slots needed in f.
				x.cz3();f[27]=x.getVal();
			}
		}

		// length
		z.v0= _f19;	z.v1=oper; z.v2=wl;z.cz3();f[c++]=z.getVal();

		// features of the preceding tokens, as far as they exist
		if (i<1) return ;

		x.v0 = _f27; x.v1=oper;
		x.v2 = is.forms[ic][i-1];x.cz3();f[c++]=x.getVal();

		if (i<2) return ;

		//added this before it was 99.46
		x.v0 = _f28; x.v2 = is.forms[ic][i-2];x.cz3();f[c++]=x.getVal();

		// result 99.484
		if (i<3) return ;

		x.v0 = _f31; x.v1=oper; x.v2 = is.forms[ic][i-3]; x.v3 = is.forms[ic][i-2]; x.cz4();f[c++]=x.getVal();

	}


	/**
	 * Writes the form -> lemma map of the words that are not mapped by operations:
	 * the number of entries followed by one UTF key/value pair per entry.
	 * An {@link IOException} is printed and otherwise ignored.
	 *
	 * @param dos the stream to write to
	 */
	private void writeMap(DataOutputStream dos) {

		try {
			dos.writeInt(opse.size());
			for(Entry<String, String> e : opse.entrySet()) {
				dos.writeUTF(e.getKey());
				dos.writeUTF(e.getValue());
			}
		} catch (IOException e1) {
			e1.printStackTrace();
		}
	}


	/**
	 * Reads the form -> lemma map in the format written by {@code writeMap} and adds the
	 * entries to {@link #opse}. An {@link IOException} is printed and otherwise ignored;
	 * entries read before the failure stay in the map.
	 *
	 * @param dis the stream to read from
	 */
	public void readMap(DataInputStream dis) {
		try {
			int size = dis.readInt();
			for(int i =0; i<size;i++) {
				// key is read before value (left-to-right argument evaluation)
				opse.put(dis.readUTF(), dis.readUTF());
			}
		} catch (IOException e1) {
			e1.printStackTrace();
		}
	}


	/**
	 * Writes the word map followed by the cluster. An {@link IOException} raised while
	 * writing the cluster is printed and otherwise ignored.
	 *
	 * @see is2.tools.IPipe#write(java.io.DataOutputStream)
	 */
	@Override
	public void write(DataOutputStream dos) {
		this.writeMap(dos);
		try {
			cl.write(dos);
		} catch (IOException e) {
			e.printStackTrace();
		}

	}

}
